#--------------------------------------------------------------------------------#

# Wuhan Wenzheng website data as evidence ####

#--------------------------------------------------------------------------------#

# Authors: WANG, Howard; CHENG, Edmund; CHEN, Xi; LIANG, Hai
# Version: 2024-Oct-30
# R version 4.4.1 (2024-06-14)
# Running under: macOS 15.1


rm(list = ls())

#Set directory#
(code_file_path = rstudioapi::getActiveDocumentContext()$path) # Get current path of this code file
setwd(dirname(code_file_path)) # dirname(): reach the upper level folder
getwd() # current directory

#Set packages#
#install.packages("quanteda")
library(quanteda) #version 3.2.3
#install.packages("jiebaR")
library(jiebaR) #version 0.11
#install.packages("stm")
library(stm) #version 1.3.6
#install.packages("lubridate")
library(lubridate) #version 1.8.0
#install.packages("dplyr")
library(dplyr) #version 1.0.10
#install.packages("ggplot2")
library(ggplot2) #version 3.4.0
#install.packages("extrafont")
library(extrafont) #version 0.18
library(stringr)

library(lme4) # for using glmer
library(sjPlot) # for tab_model

# Prepare to show Chinese characters
loadfonts()
if (!require("showtext")) { install.packages("showtext")}
library(showtext)
showtext_auto(enable = T)



#--------------------------------------------------------------------------------#
#___________________________----
#--------------------------------------------------------------------------------#
## Figure 7: Reply amount trend####

load(file = "data/wuhan_liuyan_2017to2022.Rdata")
data = wuhan_liuyan_2017to2022
## Time variables
#Time range
class(data$post_ymd)
summary(data$post_ymd) #"2017-04-28" "2022-07-07" @@@
# Min.      1st Qu.       Median         Mean      3rd Qu.         Max.         NA's 
# "2017-04-28" "2019-01-08" "2020-07-26" "2020-03-30" "2021-05-27" "2022-07-07"       "5444" 

# post time
data$post_ym=ym(paste0(data$post_year,"-",data$post_month)) # 5444 failed to parse
data = data |> subset(!is.na(post_ym))

# reply time
data$reply_ym=ym(paste0(data$reply_year,"-",data$reply_month)) # 5444 failed to parse

# reply speed
data$reply_days_taken = data$reply_ymd - data$post_ymd



# daily 
summary(data$reply_ymd)#"2017-04-28","2022-07-13"
day_reply=data|>group_by(reply_ymd)|>summarize(reply_n_day=n())
(average <- ceiling(mean(day_reply$reply_n_day, na.rm = TRUE)) )

p_day <- day_reply |>
  ggplot(aes(reply_ymd, reply_n_day)) +
  geom_line() +
  theme_classic() +
  xlab("Day (28 Apr. 2017 - 13 Jul. 2022)") +
  ylab("Reply Number") +
  theme(
    axis.text.x = element_text(angle = 45, size = 10, margin = margin(t = 10)),  # Adjust margin
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()
  ) +
  scale_x_date(
    date_labels = "%b%Y",
    date_breaks = "90 days",
    limits = c(ymd("2017-04-01"), ymd("2022-08-01"))
  ) +
  geom_vline(xintercept = ymd(20200129), linetype = "dotted", color = "red") +
  annotate("text", x = ymd(20200116), y = 1000, label = "Wuhan Lockdown", colour = "red", size = 5, angle = 90) +
  geom_vline(xintercept = ymd(20200408), linetype = "dotted", color = "red") +
  annotate("text", x = ymd(20200325), y = 1000, label = "Wuhan Lifting", colour = "red", size = 5, angle = 90) +
  geom_vline(xintercept = ymd(20220228), linetype = "dotted", color = "red") +
  annotate("text", x = ymd(20220215), y = 1000, label = "Shanghai Lockdown", colour = "red", size = 5, angle = 90) +
  ylim(1, 1500) + 
  geom_hline(yintercept = average, linetype = "dashed", color = "grey") +
  annotate("text", x = as.Date("2017-04-01"), y = average + 50, 
           label = paste("Mean =", average), color = "black", size = 4, hjust = 0) + 
  geom_rect(xmin = as.Date("2019-01-01"), xmax = as.Date("2019-12-31"), ymin = -Inf, ymax = Inf,
            fill = "yellow", alpha = 0.002)



# year-month trend
summary(data$reply_ym)#"2017-04-01","2022-07-01"
ym_reply=data|>group_by(reply_ym)|>summarize(reply_n=n()) # aggregate
(average <- ceiling(mean(ym_reply$reply_n, na.rm = TRUE)) )

p_month <- ym_reply |>
  ggplot(aes(reply_ym, reply_n)) +
  geom_line() +
  theme_classic() +
  xlab("Month (Apr. 2017 - Jul. 2022)") +
  ylab("Reply Number") +
  theme(
    axis.text.x = element_text(angle = 45, size = 10, margin = margin(t = 10)),  # Adjust margin
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()
  ) +
  scale_x_date(
    date_labels = "%b%Y",
    date_breaks = "3 months",
    limits = c(ymd("2017-04-01"), ymd("2022-08-01"))
  ) +
  geom_vline(xintercept = ymd(20200129), linetype = "dotted", color = "red") +
  annotate("text", x = ymd(20200116), y = 20000, label = "Wuhan Lockdown", color = "red", size = 5, angle = 90) +
  geom_vline(xintercept = ymd(20200408), linetype = "dotted", color = "red") +
  annotate("text", x = ymd(20200325), y = 20000, label = "Wuhan Lifting", color = "red", size = 5, angle = 90) +
  geom_vline(xintercept = ymd(20220228), linetype = "dotted", color = "red") +
  annotate("text", x = ymd(20220215), y = 20000, label = "Shanghai Lockdown", color = "red", size = 5, angle = 90) +
  ylim(1, 30000) +
  geom_hline(yintercept = average, linetype = "dashed", color = "grey") +
  annotate("text", x = as.Date("2017-04-01"), y = average + 1000, 
           label = paste("Mean =", average), color = "black", size = 4, hjust = 0) +
  geom_rect(xmin = as.Date("2019-01-01"), xmax = as.Date("2019-12-31"), ymin = -Inf, ymax = Inf,
            fill = "yellow", alpha = 0.002)


p_wuhan = ggpubr::ggarrange(p_day, p_month, ncol=1, nrow=2, labels = c("A","B"), 
          font.label = list(size = 12, color = "blue", face = "bold"))

ggsave(plot = p_wuhan, "output/main/Figure 7 Wuhan online petition, day and month reply.jpg", width = 12, height = 7, dpi = 100, quality = 1000)



###---
#--------------------------------------------------------------------------------#
#___________________________----
#--------------------------------------------------------------------------------#
# Focusing on 2019 data: Wuhan online petition analysis ####

load(file = "data/wuhan_liuyan_2019.Rdata")
d = wuhan_liuyan_2019

## Reply quality ######
d$reply_content = d$answersList_asContent |> str_remove_all("\\s|<p><br></p>")
d$reply_content[1:6]
## Referral: it includes a particular telephone number or landline number
d$referral = as.numeric(str_detect(d$reply_content, "([0-9]{0,4}-?[0-9]{7,8})|([0-9]{11})|(请您联系)|(请联系)|(请致电)|(请拨打)" ) )
summary(d$referral) # mean = 0.06817

## Direct information: it contains exact numbers before Yuan (元) or Wan Yuan (万元) or other precise contents
d$direct_info = as.numeric(str_detect(d$reply_content, "([0-9]日)|([0-9]号)|([0-9]分)|([0-9]点)|([0-9]元)|([0-9]万)|([0-9]人)|([0-9]米)|([0-9]多米)|([0-9]千米)|([0-9]平方)|([0-9]万平米)|([0-9]户)|([0-9]家)|([0-9]度)|([0-9]摄氏度)|([0-9]℃)|([0-9]条)|([0-9]%)") ) 
summary(d$direct_info) # 0.5203


## action  
d$action = as.numeric(str_detect(d$reply_content, "(已向您反馈|拨打您|拨打你|电话沟通|工作人员联系|已向你联系|已向您联系|已联系您|已联系你|与您进行联系|已与您电话|取得联系|与你联系|与您联系|电话联系|电话回复|与您沟通|通过电话|电话告知您|电话告知你|进行了沟通|详细解答|回访)|(表示满意|表示知晓|表示感谢|表示理解|表示认可|您家人表示认可|表示知悉|表示已知晓|表示已知悉|表示同意|表示愿意)|(情况属实|进行了核实|进行了核查|进行了调查|实地调查)|(已经得到解决|已得到解决|现已解决|现已要求|进行了拆除|进行了约谈|已责令|写保证书|承诺|[0-9]日前发放|[0-9]日发放|[0-9]号前发放|[0-9]号发放|到您家中|现场|入户查看|上门|现场走访|面谈|当面)|(约谈|惩罚|经协调|会同|已解決|已经解决|已退款|已经退款|已妥善解決|退款成功|已全额退款)") )
summary(d$action) # 0.5591

# resolved
d$resolved = as.numeric(str_detect(d$reply_content, "(表示满意|表示感谢|表示理解|表示认可|表示同意|表示愿意)|(已经得到解决|已得到解决|已解决|已经解决|进行了拆除|进行了约谈|已责令|写保证书|承诺|[0-9]日前发放|[0-9]日发放|[0-9]号前发放|[0-9]号发放|已退款|已妥善解決|退款成功|已全额退款)") )
summary(d$resolved ) # 0.08722


# concrete
d$concrete = ifelse(d$direct_info==0 & d$action==0, 0, 1) # @@@ At least one of the two is equal to 1, then the reply is defined as concrete.
table(d$concrete) # 0: 42080 1: 115141 
summary(d$concrete) # 0.7324

# concrete_broad
d$concrete_broad = ifelse(d$referral==0 & d$direct_info==0 & d$action==0, 0, 1) 

## Complaint (measured by regular expression, short for re) #####
# Using regular expression to measured complaint_re
d$complaint_re_temp = d$content |> str_extract_all("投诉|反映问题|不满|申诉|抗议")
d$complaint_re_n = sapply(1:nrow(d), function(i) length(d$complaint_re_temp[i][[1]]) )
d$complaint_re = ifelse(d$complaint_re_n>0,1,0)
check = d|> select(content, complaint_re_temp, complaint_re_n, complaint_re)
table(d$complaint_re)[2]/nrow(d) # 0.08993709

## Post controls #####
## Post length
d$post_length = nchar(d$content) # "nchar()" calculates the length of a string. One Chinese character also indicates one length. 
table(is.na(d$post_length )) 
summary(d$post_length ) 
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 3.0    63.0   110.0   159.6   194.0  2890.0 
d$post_length_log = log(d$post_length)

## Leader controls #####
# fid
check = table(d$fid) |> as.data.frame()


## Reply controls ####
## Reply length
d$reply_length = nchar(d$reply_content) # "nchar()" calculates the length of a string. One Chinese character also indicates one length. 
table(is.na(d$reply_length )) 
summary(d$reply_length ) 
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 14.0   126.0   190.0   239.6   294.0 11114.0 
check = d|>subset(reply_length<=20)
d$reply_length_log = log(d$reply_length)

## reply speed
d$replied_days = ymd(d$reply_ymd) - ymd(d$post_ymd) # speed
d$replied_days = d$replied_days |> as.character() |> str_remove_all("[a-z]") |> as.numeric()
summary(d$replied_days)
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 0.000   3.000   6.000   5.986   8.000 926.000 


## Negative information collection #####
## sentiment of claims 
cutter= jiebaR::worker(stop_word = "data/stopword.txt", symbol = F, bylines = T) # read the "stopword" file, and generate a cutter.
text = d$content # The text of original post is the main objective. 
results_all <- jiebaR::segment(text, cutter) # apply cutter to text. 
text <- as.data.frame(sapply(sapply(results_all, function(x){gsub("([a-zA-Z.0-9]+)|(领导|书记|现在)", "", x)}), function(x){paste(x, collapse = " ")}),stringsAsFactors = FALSE) #combine the segmented things
text[text== "" | text == "  " | text == ". "] <- NA
d$post_segmented <- text[,1] 

d$content[1:6]
d$post_segmented[1:6]

negative_dictionary <- readLines("data/negative.txt", warn = FALSE) |> str_remove_all("\\s| ") |> unique()
length(negative_dictionary) # 11670

f_negative <- function(text, dictionary) {
  text <- str_trim(text)
  words <- unlist(strsplit(text, " ")) # Split text into words
  neg_words <- words %in% dictionary # Count the number of negative words
  score <- sum(neg_words)
  return(score)
}
d$post_negative_n <- sapply(d$post_segmented, f_negative, dictionary=negative_dictionary) # it takes 3 minutes.
table(d$post_negative_n)

# To standardized values to range from 0 to 1, use min-max normalization:
min <- min(d$post_negative_n, na.rm = TRUE)
max <- max(d$post_negative_n, na.rm = TRUE)
d$normalized_negative_score <- (d$post_negative_n - min) / (max - min)
check = d|> select(content,post_segmented,post_negative_n,normalized_negative_score)
hist(d$post_negative_n)
hist(d$normalized_negative_score)

## Performing citizenship #####

### Subjecthood citizenship
# “Respected Mayor” (zunjingde shizhang), “The Great Personage of the Mayor” (shizhang daren), “help” (bangzhu), “take charge of” (zuozhu), “peasant worker” (nongmingong), disadvantaged group (ruoshi qunti), 
# loves the people，pitfull, mercy
d$subject_temp = d$content |> str_extract_all("尊敬的领导|尊敬的市委书记|书记大人|青天大老爷|包青天|爱民|父母官|帮助|做主|作主|农民工|弱势群体|可怜|苦不堪言|度日如年|生活困难|生活困苦|恩惠")
d$subject_n = sapply(1:nrow(d), function(i) length(d$subject_temp[i][[1]]) )
d$subject_citizenship = ifelse(d$subject_n>0,1,0)
summary(d$subject_citizenship) # 0.05862


### Legal Citizenship 
# "We estimate that 14% of the public transcripts we collected reference the law or legal rights." (Distelhorst and Fu 2019, 114))
# rights (quanli)，another document, casually hoodwink, enforce/implement the law, violations of law, illegal, illegal activities, lawful rights and interests.
# open, equal, and transparent project, do you dare tell the truth, legal, legal basis,treat everyone equally, Is this fair, the same as, the principle of equality, law-abiding, legal consciousness
d$legal_temp = d$content |> str_extract_all("权利|维权|维护权利|文件|颁布|发布|条款|规定|法律|法规|规章|随意欺骗|欺上瞒下|欺瞒|欺骗|执行法律|落实政策|落实法律|违法|违反法律|非法的|非法行为|非法活动|权利与利益|法律赋予的|依法|公开|平等|透明|真相|法律基础|法治|法制|公平对待每个人|一视同仁|公平|公正|同样|无差别|无区别|同等情况|同等条件|一模一样|一样|守法|遵守法律|遵纪守法|法治意识")
d$legal_n = sapply(1:nrow(d), function(i) length(d$legal_temp[i][[1]]) )
d$legal_citizenship = ifelse(d$legal_n>0,1,0)
summary(d$legal_citizenship) # 0.1658


### socialist citizenship
# basic social welfare needs, reciprocity, wellbeing, subsistence and development, food, water, shelter, healthcare, and education
# “return to us” (huangei women), return a harmonious living environment to the residents, return to our children a safe and happy childhood, return a safe intersection, 
# Return a safe living environment to the people!, quench the anger of the people (ping minfen), develop the local economy,a genocidal killer (miezu de shashou)
# personally contact the media, I’m not ruling out going to a journalist to complain or using other channels, petition, Next time who will you call to action? Where is heaven’s justice (tian li he zai), 
# perfunctorily, talking about military tactics on paper (zhishangtanbing), not even caring about our livelihoods? How can we live, received bribes,publish this letter, “serving the people” (wei renmin fuwu), “virtuous rule” (dezhi) 
d$socialist_temp = d$content |> str_extract_all("基本社会福利|基本生活保障|物质保障|发展|食物|水|住宿|健康|医疗|教育|衣食住行|还给我们|还我们|还我|平民愤|平息众怒|发展经济|发展地方经济|民族的杀手|汉奸|民族的罪人|历史罪人|联系媒体|记者|信访|号召谁|动员水|天理何在|敷衍|纸上谈兵|关心民生|民生|生计|受贿|公开民意|民意|群众|为人民服务|德治")
d$socialist_n = sapply(1:nrow(d), function(i) length(d$socialist_temp[i][[1]]) )
d$socialist_citizenship = ifelse(d$socialist_n>0,1,0)
summary(d$socialist_citizenship) # 0.2717

check = d|> select(content, subject_temp, subject_n, subject_citizenship,legal_temp,legal_n, legal_citizenship, socialist_temp, socialist_n,socialist_citizenship)


## Risk: Protest + 12345 hotline #####
## Protest
d$protest_keywords = d$content |> str_extract_all("业主|维权|农民工|血汗钱|堵路|特警|拖欠|抗议|讨薪|横幅|罢工|游行|拆迁|执法|暴力|强拆|围堵|拦路|示威|镇政府|县政府|区政府|政府门口|政府门前|黑心|黑社会|勾结|强制|污染|信访|抓走|拖欠工资|围观|主持公道|欠薪|环保|镇压|权益|诉求|王法|诈骗|真相|孩子|强征|申冤|律师|访民|为民做主|维稳|静坐|征地")
d$protest_keywords_n = sapply(1:nrow(d), function(i) length(d$protest_keywords[i][[1]]) )
d$protest_keywords = sapply(1:nrow(d), function(i) paste0(d$protest_keywords[i][[1]], collapse = " ") )
d$protest_risk = ifelse(d$protest_keywords_n>0,1,0)
table(d$protest_risk)[2]/nrow(d) # 0.3026949

## 12345 hotline risk 
d$hotline12345 = d$content |> str_extract_all("12345市长热线|12345热线|市民热线|市长热线|政务热线|12345")
d$hotline12345_n = sapply(1:nrow(d), function(i) length(d$hotline12345[i][[1]]) )
d$hotline12345_collapse = sapply(1:nrow(d), function(i) paste0(d$hotline12345[i][[1]], collapse = " ") )
d$from_hotline12345 = ifelse(d$hotline12345_n>0,1,0)
table(d$from_hotline12345)[2]/nrow(d) # 0.01621921


#--------------------------------------------------------------------------------#
#___________________________----
#--------------------------------------------------------------------------------#

# df: STM and Topic issue ####
df = d

## define text
cutter= jiebaR::worker(stop_word = "data/stopword.txt", symbol = F, bylines = T) # read the "stopword" file, and generate a cutter.
text = df$content # The text of original post is the main objective.

## Segment the text
results_all <- segment(text, cutter) # apply cutter to text.
text <- as.data.frame(sapply(sapply(results_all, function(x){gsub("([a-zA-Z.0-9]+)|(领导|书记|现在)", "", x)}), function(x){paste(x, collapse = " ")}),stringsAsFactors = FALSE) #combine the segmented things

## Construct the document matrix and affiliate the metadata
df$text <- text[,1] # treat text as a variable of df
corpus1 <- corpus(df$text, docvars = df) 
tokens1 <- tokens(corpus1) 
dfm<- dfm(tokens1)
dim(dfm) # 157221  37085 
out <- quanteda::convert(dfm, to = "stm", docvars = df)

# construct the stm model

k=20
# This make take a little bit long to run
###---
# wuhan_topic_model <- stm::stm(documents = out$documents, # documents can be dfm.
#                               vocab = out$vocab, # Character vector specifying the words in the corpus in the order of the vocab indices in documents.
#                               K = k, # If init.type="Spectral" you can also set K=0 to use the algorithm of Lee and Mimno (2014) to set the number of topics (although unlike the standard spectral initialization this is not deterministic)
#                               prevalence = ~ complaint_re+s(post_month), # If one of the variables do not exist, it will report errors.
#                               content = NULL,
#                               init.type = "Spectral", # Among c("Spectral", "LDA", "Random", "Custom"), Spectral is the default.
#                               seed = 12345, # Seed for the random number generator. stm saves the seed it uses on every run so that any result can be exactly reproduced. When attempting to reproduce a result with that seed, it should be specified here.
#                               max.em.its=75, # default is 500. # The maximum number of EM iterations. If convergence has not been met at this, point, a message will be printed. If you set this to 0 it will return the initialization.
#                               verbose = TRUE, # A logical flag indicating whether information should be printed to the screen.
#                               emtol = 1e-3, # emtol = 1e-3 to match the Blei convergence tolerance. Compare to the default emtol, 1e-3 will only use half of the time.
#                               data = out$meta) # an optional data frame containing the prevalence and/or content covariates.
# 
# save(wuhan_topic_model,file = "data/stm/wuhan_topic_model.Rdata")
###---


load(file = "data/stm/wuhan_topic_model.Rdata")
summary(wuhan_topic_model) 
# A topic model with 20 topics, 157221 documents and a 37085 word dictionary.

# Manually label the topics
topic_labels = paste0("Topic ",
                      c("1: Education", 
                        "2: Noise, sleep", 
                        "3: Unkonwn1",
                        "4: Traffic", 
                        "5: Daily fee, price", 
                        "6: Policing, public security",
                        "7: Hospital, vaccine",
                        "8: Transportation, road",
                        "9: Construction",
                        "10: Public service, tourism",
                        "11: Consumption, shop",
                        "12: Public bus, commute",
                        "13: Realestate",
                        "14: Unkonwn2",
                        "15: Labor, salary",
                        "16: Homeonwers",
                        "17: Peasants",
                        "18: Pollution",
                        "19: Subway, train",
                        "20: Unknown3"),
                      "|")

### Figure 8 ####
pdf("output/main/Figure 8 Wuhan stm topic.pdf", family = "GB1", width = 15.5, height = 8) # GB1 
plot(
  wuhan_topic_model,
  type = "summary", # type = c("summary", "labels", "perspectives", "hist"),
  n = 20, # number of keywords
  topics = NULL,
  labeltype="frex", # labeltype = c("prob", "frex", "lift", "score"),
  main = "The Plot of Structural Topic Modeling with Labels",
  xlab = "Proportion of Topics",
  xlim = c(0,0.4),
  family = "GB1", # Font (or STSong)
  width = 100,
  plabels = NULL,
  text.cex = 1,
  topic.names = topic_labels
)
dev.off() 

# Extract topic values from the stm model

# Check the quality of the topic proportion
df_stm = data.frame(topic_score = wuhan_topic_model$theta,
                    content = out$meta$content, 
                    tid = out$meta$queryCode )

names(df_stm) = c(
  "topic1_Education", 
  "topic2_Noise_sleep", 
  "topic3_Unkonwn1",
  "topic4_Traffic", 
  "topic5_Daily_fee_price", 
  "topic6_Policing_public_security",
  "topic7_Hospital_vaccine",
  "topic8_Transportation_road",
  "topic9_Construction",
  "topic10_Public_service_tourism",
  "topic11_Consumption_shop",
  "topic12_Public_bus_commute",
  "topic13_Realestate",
  "topic14_Unkonwn2",
  "topic15_Labor_salary",
  "topic16_Homeonwers",
  "topic17_Peasants",
  "topic18_Pollution",
  "topic19_Subway_train",
  "topic20_Unknown3",
  "content", "queryCode")


#--------------------------------------------------------------------------------#
#___________________________----
#--------------------------------------------------------------------------------#

# dm: Merge the topic proportion score into the original df ####
table(df$queryCode==df_stm$queryCode) # the sequence has not been changed. 

dm = left_join(df, df_stm, by=c("content", "queryCode"))


## Assign label to variables ####
library(labelled)
var_label(dm) <- list(
  # DV
  concrete="Concrete reply (1=yes, 0=no)",concrete_broad="Concrete reply (broad)",	action="Action as reply",	resolved="Problem resolved as reply",
  reply_length="Reply length",reply_length_log="Reply length (log)",
  replied_days = "Reply of days taken", 
  # Treatment variable
  complaint_re ="Complaint (re,1=yes, 0=no)",
  # post controls
  post_length_log= "Post length (log)",
  # Alternative explanations and mechanisms
  normalized_negative_score="Normalized negative score",
  subject_citizenship="Subjecthood citizenship", legal_citizenship="Legal citizenship", socialist_citizenship="Socialist citizenship",
  protest_risk="Claim containing protest risk", from_hotline12345="Claim From 12345 hotline"
)

dm = dm |> select(!contains("_temp"))


# ________________ ----
#--------------------------------------------------------------------------------#
# Analyses on Wuhan ----
#--------------------------------------------------------------------------------#

## Table E1: Variable summary ####
# dm is the cleaned data for regression
dt <- dm %>% 
  dplyr::select(reply_ym, concrete, concrete_broad, action, resolved, 
                complaint_re,  # treatment
                post_length, post_length_log, 
                #topic1_Education,topic2_Noise,topic4_Transportation_light,topic5_Daily_fee,topic6_Policing,topic7_Hospital,topic8_Transportation_road,topic9_Construction,topic10_Public_goods,topic11_Consumption,topic12_Public_bus,topic13_Realestate,topic15_Employment,topic16_Homeonwers,topic17_Peasants,topic18_Pollution,topic19_Subway_train,
                contains("topic"),
                replied_days, reply_length,reply_length_log # reply controls
  )

stargazer::stargazer(dt[,],
                     type = "text", # use "html" or "latex" for output to html or latex
                     label = "tab:desc_stats",
                     digits = 3, # number of decimal places
                     header = FALSE, 
                     summary.stat = c("n", "mean", "median", "sd", "min", "max"),
                     out = "output/appendix/Table F1 Variables summary, Wuhan evidence.html" ) 



## Use district as cluter unit ####
# Treat reply organization as the fixed effect and random effect unit. 
t = table(dm$answersList_organization) |> as.data.frame() 
qu = dm |> subset(answersList_organization |> str_detect("区"))
t2 = table(qu$answersList_organization) |> as.data.frame()

dm$district=NA
# 江岸,江汉,硚口,汉阳,武昌,青山,洪山
# 东西湖,汉南
# 蔡甸, 江夏, 黄陂, 新洲
# 东湖(东湖高新区+东湖新技术开发区管委会), 武汉经济技术开发区管委会
dm = dm |> mutate(
  district = ifelse(answersList_organization=="洪山区", "Hongshanqu", district),
  district = ifelse(answersList_organization %in% c("东湖高新区", "东湖新技术开发区管委会") , "Donghugaoxinqu", district),
  district = ifelse(answersList_organization=="江夏区", "Jiangxiaqu", district),
  district = ifelse(answersList_organization=="汉阳区", "Hanyangqu", district),
  district = ifelse(answersList_organization=="黄陂区", "Huangpiqu", district),
  district = ifelse(answersList_organization=="江岸区", "Jianganqu", district),
  district = ifelse(answersList_organization=="武昌区", "Wuchangqu", district),
  district = ifelse(answersList_organization=="东西湖区", "Dongxihuqu", district),
  district = ifelse(answersList_organization=="江汉区", "Jianghanqu", district),
  district = ifelse(answersList_organization=="武汉经济技术开发区管委会", "Jingkaiqu", district),
  district = ifelse(answersList_organization=="硚口区", "Qiaokouqu", district),
  district = ifelse(answersList_organization=="新洲区", "Xinzhouqu", district),
  district = ifelse(answersList_organization=="蔡甸区", "Caidianqu", district),
  district = ifelse(answersList_organization=="青山区", "Qingshanqu", district)
)
check1 = table(dm$district)|>as.data.frame()

t=dm |> subset(answersList_organization %in% c("洪山区", "东湖高新区", "东湖新技术开发区管委会", "江夏区", "汉阳区", "黄陂区", "江岸区", "武昌区", "东西湖区", "江汉区", "武汉经济技术开发区管委会", "硚口区", "新洲区", "蔡甸区", "青山区") )
check2=table(t$answersList_organization)|>as.data.frame()


## Topic detected ####
threshold <- 0.5
# List of topic column names
topic_columns <- c(
  "topic1_Education", "topic2_Noise_sleep", "topic3_Unkonwn1", "topic4_Traffic", 
  "topic5_Daily_fee_price", "topic6_Policing_public_security", "topic7_Hospital_vaccine", 
  "topic8_Transportation_road", "topic9_Construction", "topic10_Public_service_tourism", 
  "topic11_Consumption_shop", "topic12_Public_bus_commute", "topic13_Realestate", 
  "topic14_Unkonwn2", "topic15_Labor_salary", "topic16_Homeonwers", "topic17_Peasants", 
  "topic18_Pollution", "topic19_Subway_train", "topic20_Unknown3"
)

# Convert probabilities to binary indicators
for (topic in topic_columns) {
  binary_topic <- paste0(topic, "_binary")  # Create a new column name for binary indicators
  dm[[binary_topic]] <- ifelse(dm[[topic]] > threshold, 1, 0)
}
t = dm |> select(contains("topic"))
binary_topic_columns <- paste0(topic_columns, "_binary")
binary_topic_string <- paste(binary_topic_columns, collapse = "+")
cat(binary_topic_string)

## Table 4 ####

summary(wuhan1 <- glmer(concrete~complaint_re+
                          # Thread controls
                          post_length_log+topic1_Education+topic2_Noise_sleep+topic4_Traffic+topic5_Daily_fee_price+topic6_Policing_public_security+topic7_Hospital_vaccine+topic8_Transportation_road+topic9_Construction+topic10_Public_service_tourism+topic11_Consumption_shop+topic12_Public_bus_commute+topic13_Realestate+topic15_Labor_salary+topic16_Homeonwers+topic17_Peasants+topic18_Pollution+topic19_Subway_train+
                          # Reply controls
                          replied_days+reply_length_log+
                          (1|reply_month)+(complaint_re|reply_month),
                        dm,
                        nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))


# concrete broad
summary(wuhan2 <- glmer(concrete_broad~complaint_re+
                          # Thread controls
                          post_length_log+topic1_Education+topic2_Noise_sleep+topic4_Traffic+topic5_Daily_fee_price+topic6_Policing_public_security+topic7_Hospital_vaccine+topic8_Transportation_road+topic9_Construction+topic10_Public_service_tourism+topic11_Consumption_shop+topic12_Public_bus_commute+topic13_Realestate+topic15_Labor_salary+topic16_Homeonwers+topic17_Peasants+topic18_Pollution+topic19_Subway_train+
                          # Reply controls
                          replied_days+reply_length_log+ 
                          (1|reply_month)+(complaint_re|reply_month),
                        dm,
                        nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))

# action
summary(wuhan3 <- glmer(action~complaint_re+
                          # Thread controls
                          post_length_log+topic1_Education+topic2_Noise_sleep+topic4_Traffic+topic5_Daily_fee_price+topic6_Policing_public_security+topic7_Hospital_vaccine+topic8_Transportation_road+topic9_Construction+topic10_Public_service_tourism+topic11_Consumption_shop+topic12_Public_bus_commute+topic13_Realestate+topic15_Labor_salary+topic16_Homeonwers+topic17_Peasants+topic18_Pollution+topic19_Subway_train+
                          # Reply controls
                          replied_days+reply_length_log+  
                          (1|reply_month)+(complaint_re|reply_month),
                        dm,
                        nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))

# resolved
summary(wuhan4 <- glmer(resolved~complaint_re+
                          # Thread controls
                          post_length_log+topic1_Education+topic2_Noise_sleep+topic4_Traffic+topic5_Daily_fee_price+topic6_Policing_public_security+topic7_Hospital_vaccine+topic8_Transportation_road+topic9_Construction+topic10_Public_service_tourism+topic11_Consumption_shop+topic12_Public_bus_commute+topic13_Realestate+topic15_Labor_salary+topic16_Homeonwers+topic17_Peasants+topic18_Pollution+topic19_Subway_train+
                          # Reply controls
                          replied_days+reply_length_log+
                          (1|reply_month)+(complaint_re|reply_month),
                        dm,
                        nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))


sjPlot::tab_model(wuhan1,wuhan2,wuhan3,wuhan4,
                  transform=NULL, # not showing odds ratio
                  show.ci=F, show.se=T, collapse.se = T, p.style="stars", # one column (p: numeric, stars (asterisk), scientific, numeric_stars)
                  digits=2, digits.p=2,
                  terms=c("complaint_re"), # Only present the interested predictors
                  show.r2= F, show.aic=T,show.loglik=F,show.dev=F,show.obs=T,
                  file= "output/main/Table 4 Wuhan evidence.html")  



## Figure F1 #####
theme_set(theme_sjplot()) # set the format of graph 
# Concrete reply as DV
pdf("output/appendix/Figure F1 Visualization of Model 1 in Table 4.pdf", width = 10, height = 7)
plot_model(wuhan1, transform = NULL,
           vline.color = "grey",
           show.values = TRUE, value.offset = .3, value.size = 3,
           dot.size = 0.6,line.size = 0.3,
           width = 0.3,
           title = "Visualization of Model 1 in Table 4 (DV: Concrete reply)",
           wrap.title = 100)
dev.off()   


## Figure F2 #####
theme_set(theme_sjplot()) # set the format of graph 
# model 2, broad concrete reply as DV
pdf("output/appendix/Figure F2 Visualization of Model 2 in Table 4.pdf", width = 10, height = 7)
plot_model(wuhan2, transform = NULL,
           vline.color = "grey",
           show.values = TRUE, value.offset = .3, value.size = 3,
           dot.size = 0.6,line.size = 0.3,
           width = 0.3,
           title = "Visualization of Model 2 in Table 4 (DV: Broad concrete reply)",
           # terms = c(),
           wrap.title = 100)
dev.off()   


## Figure F3 #####
theme_set(theme_sjplot()) # set the format of graph 
# model 3, action as DV
pdf("output/appendix/Figure F3 Visualization of Model 3 in Table 4.pdf", width = 10, height = 7)
plot_model(wuhan3, transform = NULL,
           vline.color = "grey",
           show.values = TRUE, value.offset = .3, value.size = 3,
           dot.size = 0.6,line.size = 0.3,
           width = 0.3,
           title = "Visualization of Model 3 in Table 4 (DV: Action as reply)",
           # terms = c(),
           wrap.title = 100)
dev.off()   





# ________________ ----
# Robustness on the STM ####

# This needs about 9 hours to run.

# findingk <- stm::searchK(documents = out$documents,
#                     vocab = out$vocab,
#                     K = seq(10,50,5),
#                     init.type = "Spectral",
#                     # proportion = 0.5,
#                     M = 10, # M value for exclusivity computation
#                     heldout.seed = 12345,
#                     prevalence = ~ complaint_re+s(post_month),
#                     verbose=FALSE,
#                     emtol = 1e-3, # emtol = 1e-3 to match the Blei convergence tolerance. This will speed up the calculation.
#                     data = out$meta) 
# 
# save(findingk,file = "data/stm/wuhan_findingk.Rdata")


### Figure F4 ####
load(file = "data/stm/wuhan_findingk.Rdata")
findingk$results

###
#    K  exclus    semcoh   heldout residual     bound    lbound       em.its
# 1 10 9.367329 -87.93003  -7.41859 17.87321 -75316236 -75316221      6
# 2 15 9.528243  -92.1912 -7.351092  15.9631 -74648817 -74648789      7
# 3 20  9.65322 -95.54912 -7.304861 14.71744 -74194951 -74194908      7
# 4 25 9.706964 -100.5934 -7.275854 13.54108 -73857018 -73856960      7
# 5 30 9.724393 -101.0899 -7.252182  12.6515 -73664652 -73664577      6
# 6 35 9.783924 -102.0816  -7.21959 12.17577 -73417806 -73417714      7
# 7 40 9.814343  -104.884 -7.198723 11.73579 -73212086 -73211976      7
# 8 45 9.827836 -107.8892 -7.192052   11.413 -73127613 -73127484      7
# 9 50 9.843654 -111.2419 -7.177124 11.05872 -73006968 -73006820      7

pdf("output/appendix/Figure F4a Wuhan STM findingK.pdf")
stm::plot.searchK(findingk,text.cex = 1)
dev.off()


pdf("output/appendix/Figure F4b Wuhan STM findingK.pdf")
temp = findingk$results
names(temp)
temp = temp |> mutate(
  semantic = semcoh |> as.numeric(),
  exclusivity = exclus |> as.numeric()
)

ggplot(temp, aes(x=semantic, y=exclusivity)) + 
  geom_point() + 
  geom_text(aes(label=K),hjust=0.5, vjust=-0.5) +
  xlab("Semantic Coherence") + ylab("Exclusivity") +
  theme(panel.grid = element_blank(),
        panel.background = element_rect(color = 'black', fill = 'transparent')) +
  geom_segment(aes(x = -95.55, y = 9.60, xend = -95.55, yend = 9.65), 
               arrow = arrow(length = unit(0.3, "cm")), colour = "red")
dev.off()


cat("The end at ", format(Sys.time(), "%d %b %Y %a %X")) 



